# Load libraries and set seed
library(readr)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(dbscan)
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
set.seed(123)

Set project path and load dataset

# Get the name of working directory in RProject
proj.path <- getwd()
hotspots_raw <- read_csv(file.path(proj.path,'hotspots.csv'))
## Rows: 1781266 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): source, sensor, satellite, agency, fuel
## dbl  (33): lat, lon, uid, temp, rh, ws, wd, pcp, ffmc, dmc, dc, isi, bui, fw...
## dttm  (1): rep_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Convert date format

# Check the date format
hotspots_raw$rep_date <- as.POSIXct(hotspots_raw$rep_date, format = "%Y-%m-%d %H:%M:%S")
str(hotspots_raw$rep_date)
##  POSIXct[1:1781266], format: "2013-05-08 22:44:00" "2013-05-06 21:10:00" "2013-05-01 19:05:00" ...
# Add year and month columns
hotspots_raw$year <- year(hotspots_raw$rep_date)
hotspots_raw$month <- month(hotspots_raw$rep_date, label = TRUE, abbr = TRUE)  # Months as factor (and named)

Subset data from period 2014 - 2023 (10 years)

# Subset the dataframe to include only data from 2014 to the end of 2023
hotspots <- hotspots_raw %>%
  filter(year >= 2014 & year <= 2023)

# Check range of date
range(hotspots$rep_date) # 10 years period for further observations
## [1] "2014-04-04 18:50:00 UTC" "2023-12-31 19:14:00 UTC"

To identify specific fire events, apply DBSCAN (Density-Based Spatial Clustering of Application with Noise) function and classify every data point into cluster. The number of clusters has to be close to the official information from BC Website (https://www2.gov.bc.ca/gov/content/safety/wildfire-status/about-bcws/wildfire-statistics/wildfire-averages)

# Prepare the data for clustering based on latitude, longitude, date
event_data <- hotspots %>%
  select(lat, lon, rep_date)

# Convert date to numeric for clustering
event_data$date_numeric <- as.numeric(as.POSIXct(event_data$rep_date))

# View event_data
head(event_data)
## # A tibble: 6 × 4
##     lat   lon rep_date            date_numeric
##   <dbl> <dbl> <dttm>                     <dbl>
## 1  53.3 -125. 2014-08-06 05:11:00   1407301860
## 2  56.1 -124. 2014-08-14 10:14:00   1408011240
## 3  55.3 -122. 2014-08-17 18:58:00   1408301880
## 4  56.4 -125. 2014-08-05 11:09:00   1407236940
## 5  56.2 -125. 2014-07-16 11:29:00   1405510140
## 6  53.4 -125. 2014-08-11 06:05:00   1407737100

Scale the converted date in numeric to have similar range as latitude / longitude. 1 degree of coordinate is ~ 111 km and 20 hours ~ 72000 seconds. Add new column named event_cluster to the hotspots dataset

# Scale factor : 1 second ~ 0.001 degrees (approximately)
event_data$date_scaled <- event_data$date_numeric * 0.001

# Apply DBSCAN clustering
db <- dbscan(event_data[, c("lat", "lon", "date_scaled")], eps = 0.6, minPts = 5)
# eps value of 0.6 degrees is about 66 km

# Add cluster labels to the original dataset
hotspots$event_cluster <- db$cluster

Preview single specific event for observation

# Filter for a specific cluster (event of fire) for sample
event_500 <- hotspots %>%
  filter(event_cluster == 500)# Replace with the specific cluster number

event_500
## # A tibble: 14 × 42
##      lat   lon rep_date                uid source sensor satellite agency  temp
##    <dbl> <dbl> <dttm>                <dbl> <chr>  <chr>  <chr>     <chr>  <dbl>
##  1  53.2 -124. 2014-07-16 05:37:00 4074377 NOAA   AVHRR  METOP-B   BC      25.5
##  2  53.1 -124. 2014-07-16 05:37:00 4074375 NOAA   AVHRR  METOP-B   BC      26.1
##  3  53.1 -124. 2014-07-16 05:37:00 4074372 NOAA   AVHRR  METOP-B   BC      25.5
##  4  53.1 -124. 2014-07-16 05:37:00 4074371 NOAA   AVHRR  METOP-B   BC      25.5
##  5  53.2 -124. 2014-07-16 05:37:00 4074380 NOAA   AVHRR  METOP-B   BC      25.5
##  6  53.1 -124. 2014-07-16 05:37:00 4074370 NOAA   AVHRR  METOP-B   BC      25.5
##  7  53.1 -124. 2014-07-16 05:37:00 4074373 NOAA   AVHRR  METOP-B   BC      25.5
##  8  53.2 -124. 2014-07-16 05:37:00 4074382 NOAA   AVHRR  METOP-B   BC      25.5
##  9  53.2 -124. 2014-07-16 05:37:00 4074381 NOAA   AVHRR  METOP-B   BC      25.5
## 10  53.2 -124. 2014-07-16 05:37:00 4074379 NOAA   AVHRR  METOP-B   BC      25.5
## 11  53.1 -124. 2014-07-16 05:37:00 4074378 NOAA   AVHRR  METOP-B   BC      26.1
## 12  53.2 -124. 2014-07-16 05:37:00 4074383 NOAA   AVHRR  METOP-B   BC      26.1
## 13  53.2 -124. 2014-07-16 05:37:00 4074376 NOAA   AVHRR  METOP-B   BC      25.5
## 14  53.1 -124. 2014-07-16 05:37:00 4074374 NOAA   AVHRR  METOP-B   BC      25.5
## # ℹ 33 more variables: rh <dbl>, ws <dbl>, wd <dbl>, pcp <dbl>, ffmc <dbl>,
## #   dmc <dbl>, dc <dbl>, isi <dbl>, bui <dbl>, fwi <dbl>, fuel <chr>,
## #   ros <dbl>, sfc <dbl>, tfc <dbl>, bfc <dbl>, hfi <dbl>, cfb <dbl>,
## #   age <dbl>, estarea <dbl>, polyid <dbl>, pcuring <dbl>, cfactor <dbl>,
## #   greenup <dbl>, elev <dbl>, cfl <dbl>, tfc0 <dbl>, sfl <dbl>, ecozone <dbl>,
## #   sfc0 <dbl>, cbh <dbl>, year <dbl>, month <ord>, event_cluster <int>
# OUTPUT 14 observations lat 53 lon -124 on 16 July 2014

Count the events without noise included

# Count the number of unique clusters
num_clusters <- length(unique(hotspots$event_cluster))-1 # Subtract 1 to exclude the noise cluster (label 0)
num_clusters
## [1] 32309
# Total number of fires in BC (official table)
2293+1801+1647+670+825+2117+1353+1050+1858+1481+1861
## [1] 16956
# 16956

Summary of the clusters

# Detailed summary of the clusters
event_details <- hotspots %>%
  filter(event_cluster != 0) %>%
  group_by(year) %>%
  summarise(
    first_cluster = first(event_cluster),
    start_date_hotspot = min(rep_date),
    end_date_hotspot = max(rep_date),
    events_count = length(unique(event_cluster))
  )
print(event_details)
## # A tibble: 10 × 5
##     year first_cluster start_date_hotspot  end_date_hotspot    events_count
##    <dbl>         <int> <dttm>              <dttm>                     <int>
##  1  2014             1 2014-04-12 21:30:00 2014-12-07 05:10:00         1786
##  2  2015          1787 2015-03-09 21:10:00 2015-11-22 21:23:00         2035
##  3  2016          3822 2016-03-25 09:27:00 2016-12-20 22:36:00          636
##  4  2017          4458 2017-03-19 20:16:00 2017-11-14 20:14:00         4293
##  5  2018          8751 2018-02-01 03:50:00 2018-11-07 06:09:00         5433
##  6  2019         14184 2019-03-13 04:07:00 2019-11-19 18:48:00         1792
##  7  2020         15976 2020-03-20 04:12:00 2020-12-08 04:45:00          699
##  8  2021         16675 2021-01-06 18:42:00 2021-12-11 05:33:00         4437
##  9  2022         21112 2022-01-12 04:44:00 2022-12-31 06:05:00         2035
## 10  2023         23147 2023-01-05 05:22:00 2023-12-31 05:22:00         9163
# First_cluster column : Number of cluster started from 1 and increase based on the following fire cluster and grouped by year.
# Start_date_hotspot : Start date of fire event captured by hotspot
# End_date_hotspot : End date of fire event captured by hotspot
# Events_count : Total number of fire event in yearly basis based on cluster.

Analyse the fire events in graphs.

# In order to identify the time period to focus on and further yearly basis comparison purpose, summarize the number of cluster per year and month (exclude number of noise)

# Summarize the number of fire events per year and month
fire_events_per_month <- hotspots %>%
  filter(event_cluster != 0) %>%  # Exclude noise clusters
  group_by(year, month) %>%
  summarise(n_events = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Print the summary table
print(fire_events_per_month)
## # A tibble: 100 × 3
##     year month n_events
##    <dbl> <ord>    <int>
##  1  2014 Apr         14
##  2  2014 May         91
##  3  2014 Jun         47
##  4  2014 Jul       9326
##  5  2014 Aug      19065
##  6  2014 Sep       1447
##  7  2014 Oct       1401
##  8  2014 Nov       2539
##  9  2014 Dec         18
## 10  2015 Mar         87
## # ℹ 90 more rows
# Reshape the data to a wide format
fire_events_wide <- fire_events_per_month %>%
  pivot_wider(names_from = month, values_from = n_events, values_fill = 0)

# Print the wide format summary table
print(fire_events_wide)
## # A tibble: 10 × 13
##     year   Apr   May   Jun    Jul    Aug    Sep   Oct   Nov   Dec   Mar   Feb
##    <dbl> <int> <int> <int>  <int>  <int>  <int> <int> <int> <int> <int> <int>
##  1  2014    14    91    47   9326  19065   1447  1401  2539    18     0     0
##  2  2015    88  6717  4373  16408   3885    557  6534  2008     0    87     0
##  3  2016  1331  6286    22    104    930   1090  1095   964   142    35     0
##  4  2017    45   226   144  77980 149777  22174 11887   544     0    18     0
##  5  2018   118  1595  1183   5021 331426   8613  8091  2147     0    15     9
##  6  2019   137  1505   128    115   3431   2935 15821  3121     0    44     0
##  7  2020     0    65     0     29   3163   1880  3688   369    80    12     0
##  8  2021   277   124  2574 153891  62967    475  4708  2701   351   175    37
##  9  2022    12    16    76   1977   4239  22405 12036  3284   230    54    60
## 10  2023   273 42385 87720 146246 156871 234536  4187  4246   787    37   302
## # ℹ 1 more variable: Jan <int>

The table summary shows how the clustered fire events are counted in yearly basis per monthly views.

Distribution of the clustered fire events in bar plot.

# Create a bar plot to visualize the number of fire events per month
ggplot(fire_events_per_month, aes(x = month, y = n_events)) +
  geom_bar(stat = "identity", fill = "steelblue", color = "black") +
  labs(title = "Number of Fire Events per Month",
       x = "Month",
       y = "Number of Fire Events") +
  scale_y_continuous(labels = scales::comma) + 
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))

The bar plot shows that the initial start of fire event is from May and significantly increased from July to September.

Visualize fire events with heatmap

# Define the color palette
colors <- c("gray95", "gray80", "yellow", "orange", "darkred", "black")

# Define the values for the color breaks
values <- c(0, 500, 1000, 2000, 10000, 350000)

# Create a heatmap to visualize the number of fire events per month and year
ggplot(fire_events_per_month, aes(x = month, y = factor(year), fill = n_events)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colors = colors, values = rescale(values), labels = scales::comma) +
  labs(title = "Number of Fire Events per Month and Year",
       x = "Month",
       y = "Year",
       fill = "Number of Events") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

From the heatmap, decided to concentrate from May to October for further analysis.

# Subset the data for the months from May to October
hotspots_peak <- hotspots %>%
  filter(month(rep_date) %in% c(5, 6, 7, 8, 9, 10))

# MAKE A HEATMAP
# Summarize the number of fire events per year and month
fire_events_per_month_subset <- hotspots_peak %>%
  filter(event_cluster != 0) %>%
  group_by(year, month) %>%
  summarise(n_events = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Print the summary table
print(fire_events_per_month_subset)
## # A tibble: 59 × 3
##     year month n_events
##    <dbl> <ord>    <int>
##  1  2014 May         91
##  2  2014 Jun         47
##  3  2014 Jul       9326
##  4  2014 Aug      19065
##  5  2014 Sep       1447
##  6  2014 Oct       1401
##  7  2015 May       6717
##  8  2015 Jun       4373
##  9  2015 Jul      16408
## 10  2015 Aug       3885
## # ℹ 49 more rows
# Reshape the data to a wide format
fire_events_wide2 <- fire_events_per_month_subset %>%
  pivot_wider(names_from = month, values_from = n_events, values_fill = 0)

# Print the wide format summary table
print(fire_events_wide2)
## # A tibble: 10 × 7
##     year   May   Jun    Jul    Aug    Sep   Oct
##    <dbl> <int> <int>  <int>  <int>  <int> <int>
##  1  2014    91    47   9326  19065   1447  1401
##  2  2015  6717  4373  16408   3885    557  6534
##  3  2016  6286    22    104    930   1090  1095
##  4  2017   226   144  77980 149777  22174 11887
##  5  2018  1595  1183   5021 331426   8613  8091
##  6  2019  1505   128    115   3431   2935 15821
##  7  2020    65     0     29   3163   1880  3688
##  8  2021   124  2574 153891  62967    475  4708
##  9  2022    16    76   1977   4239  22405 12036
## 10  2023 42385 87720 146246 156871 234536  4187

Visualize the peak fire events with heatmap

# Create a heatmap to visualize the number of fire events per month and year
ggplot(fire_events_per_month_subset, aes(x = month, y = factor(year), fill = n_events)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colors = colors, values = rescale(values), labels = scales::comma) +
  labs(title = "Number of Fire Events per Month and Year (Peak)",
       x = "Month",
       y = "Year",
       fill = "Number of Events") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

Final view of subset heatmap from May - October for further analysis.

Explore and Interpretation of data variables to determine further data concentration.

Coordinate (Latitude and Longitude)

range(hotspots$lat, na.rm = TRUE) #range is valid
## [1] 48.32883 60.00090
range(hotspots$lon, na.rm = TRUE) #range is valid
## [1] -138.328 -114.090
# British Columbia coordinate :
# Latitude: Between 48.309789°N and 60.00065°N
# Longitude: Between -139.058200 °W and -114.05423°W

Rep_date (Date range)

# Extract year and number of points from event_details
events_count <- event_details %>%
  select(year, events_count)

# Check the extracted table
print(events_count)
## # A tibble: 10 × 2
##     year events_count
##    <dbl>        <int>
##  1  2014         1786
##  2  2015         2035
##  3  2016          636
##  4  2017         4293
##  5  2018         5433
##  6  2019         1792
##  7  2020          699
##  8  2021         4437
##  9  2022         2035
## 10  2023         9163

The period of data covers 10 years period started from 2014 and included the major fire events in 2023.

Show the summary of hotspots dataset in days count per year

# Create summary table describing each hotspots dataset by year
hotspots_df_summary <- hotspots %>%
  group_by(year) %>%
  summarise(
    start_date = min(rep_date),
    end_date = max(rep_date),
    hotspots_df_day_count = as.numeric(difftime(max(rep_date), min(rep_date), units = "days"))
  )

hotspots_df_summary
## # A tibble: 10 × 4
##     year start_date          end_date            hotspots_df_day_count
##    <dbl> <dttm>              <dttm>                              <dbl>
##  1  2014 2014-04-04 18:50:00 2014-12-07 20:37:00                  247.
##  2  2015 2015-03-07 06:05:00 2015-11-22 21:23:00                  261.
##  3  2016 2016-03-25 04:10:00 2016-12-21 04:55:00                  271.
##  4  2017 2017-03-03 04:15:00 2017-11-15 11:41:00                  257.
##  5  2018 2018-01-06 03:11:00 2018-11-07 18:19:00                  306.
##  6  2019 2019-03-13 02:06:00 2019-11-19 18:54:00                  252.
##  7  2020 2020-03-03 03:45:00 2020-12-29 05:05:00                  301.
##  8  2021 2021-01-06 13:35:00 2021-12-11 18:36:00                  339.
##  9  2022 2022-01-11 13:22:00 2022-12-31 19:06:00                  354.
## 10  2023 2023-01-01 05:01:00 2023-12-31 19:14:00                  365.
# Create summary table describing each hotspots dataset by year
hotspots_df_summary <- hotspots_df_summary %>%
  left_join(events_count, by = "year") %>%
  mutate(
    start_month_day = format(as.Date(start_date), "%m-%d"),
    end_month_day = format(as.Date(end_date), "%m-%d")
  )
print(hotspots_df_summary)
## # A tibble: 10 × 7
##     year start_date          end_date            hotspots_df_day_count
##    <dbl> <dttm>              <dttm>                              <dbl>
##  1  2014 2014-04-04 18:50:00 2014-12-07 20:37:00                  247.
##  2  2015 2015-03-07 06:05:00 2015-11-22 21:23:00                  261.
##  3  2016 2016-03-25 04:10:00 2016-12-21 04:55:00                  271.
##  4  2017 2017-03-03 04:15:00 2017-11-15 11:41:00                  257.
##  5  2018 2018-01-06 03:11:00 2018-11-07 18:19:00                  306.
##  6  2019 2019-03-13 02:06:00 2019-11-19 18:54:00                  252.
##  7  2020 2020-03-03 03:45:00 2020-12-29 05:05:00                  301.
##  8  2021 2021-01-06 13:35:00 2021-12-11 18:36:00                  339.
##  9  2022 2022-01-11 13:22:00 2022-12-31 19:06:00                  354.
## 10  2023 2023-01-01 05:01:00 2023-12-31 19:14:00                  365.
## # ℹ 3 more variables: events_count <int>, start_month_day <chr>,
## #   end_month_day <chr>

This table shows the beginning and end day of data collected by yearly basis with number of fire event counts. The notable finding is started from 2021 to 2023, it shows the data recorded in full year round, making it has more data point reference and can increase accuracy of analysis.

UID (Identifier)

# Display the first 10 rows as preview
hotspots %>%
  select(uid) %>%
  head(10)  # Display the first 10 rows
## # A tibble: 10 × 1
##        uid
##      <dbl>
##  1 5097641
##  2 5349891
##  3 5470838
##  4 5048694
##  5 4073541
##  6 5198198
##  7 5303682
##  8 5537351
##  9 5296999
## 10 4073857
# The range of UID
range(hotspots$uid, na.rm = TRUE)
## [1]    31324 44237954
# The uid variable is present in historical datasets up to 2019 and absent for the rest of the year hence 40% value is missing.

Source (Source of hotspots)

hotspots %>% 
  select(source) %>% 
  count(source)
## # A tibble: 15 × 2
##    source        n
##    <chr>     <int>
##  1 HMS        4158
##  2 NASA     232234
##  3 NASA1      5013
##  4 NASA2    339423
##  5 NASA3    330797
##  6 NASA6      5091
##  7 NASA7      5410
##  8 NASA_can  12899
##  9 NASA_usa    435
## 10 NASAkmz      89
## 11 NASAwcan  49947
## 12 NASAwusa    143
## 13 NOAA     172479
## 14 UMD       28514
## 15 USFS     589933
sourse_cluster_count <- hotspots %>%
  group_by(source) %>%
  summarise(num_clusters = n_distinct(event_cluster))

# Create a bar plot of the number of clusters per satellite
ggplot(sourse_cluster_count, aes(x = source, y = num_clusters)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Number of Events Reported by Each Source",
       x = "Source",
       y = "Number of Events (clusters)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Sensor (Satellite sensor)

hotspots %>% 
  select(sensor) %>% 
  count(sensor)
## # A tibble: 8 × 2
##   sensor       n
##   <chr>    <int>
## 1 AVHRR   164520
## 2 IBAND   400710
## 3 Landsat   1014
## 4 MODIS   240827
## 5 OLI        299
## 6 VIIRS    94345
## 7 VIIRS-I 869363
## 8 VIIRS-M   5487
sensor_cluster_count <- hotspots %>%
  group_by(sensor) %>%
  summarise(num_clusters = n_distinct(event_cluster))

# Create a bar plot of the number of clusters per satellite
ggplot(sensor_cluster_count, aes(x = sensor, y = num_clusters)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Number of Events Reported by Each Sensor",
       x = "Sensor",
       y = "Number of Events (clusters)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Advanced Very High Resolution Radiometer (AVHRR) imagery, courtesy of the NOAA
#Moderate Resolution Imaging Spectroradiometer (MODIS) imagery, courtesy of the NASA
#Visible Infrared Imaging Radiometer Suite (VIIRS) imagery, courtesy of NASA LANCE FIRMS

Agency (indicate province)

hotspots %>% 
  select(agency) %>% 
  count(agency)
## # A tibble: 1 × 2
##   agency       n
##   <chr>    <int>
## 1 BC     1776565
# only output is BC as the main area of concentration

Variables with numerical columns

# Identify numerical columns to work with
numerical_columns <- c('temp',
                       'rh',
                       'ws',
                       'wd',
                       'pcp',
                       'ffmc',
                       'dmc',
                       'dc',
                       'isi',
                       'bui',
                       'fwi',
                       'ros',
                       'sfc',
                       'tfc',
                       'bfc',
                       'hfi',
                       'cfb',
                       'age',
                       'estarea',
                       'pcuring',
                       'cfactor',
                       'greenup',
                       'elev',
                       'cfl',
                       'tfc0',
                       'sfl',
                       'ecozone',
                       'sfc0',
                       'cbh')

Statistical summary of numerical variables in hotspots dataset.

# Function to describe each numerical column 
describe_numerical <- function(df, cols) {
  summary_list <- list()
  
  for (col in cols) {
    summary_stats <- data.frame(
      Variable = col,
      Missing_Values = sum(is.na(df[[col]])),
      Min = round(min(df[[col]], na.rm = TRUE), 2),
      Median = round(median(df[[col]], na.rm = TRUE), 2),
      Mean = round(mean(df[[col]], na.rm = TRUE), 2),
      Max = round(max(df[[col]], na.rm = TRUE), 2)
    )
    summary_list[[col]] <- summary_stats
  }
  
  summary_table <- bind_rows(summary_list)
  return(summary_table)
}

# Describe numerical columns for hotspots 
summary_hotspots <- describe_numerical(hotspots, numerical_columns)

# Print the summary table
print(summary_hotspots)
##    Variable Missing_Values       Min  Median    Mean         Max
## 1      temp              0    -21.35   22.07   21.03       43.88
## 2        rh              0      0.00   33.00   36.18      100.00
## 3        ws              0      0.00    8.94    9.53       59.30
## 4        wd              0      0.00  212.00  201.81      360.00
## 5       pcp              0      0.00    0.00    0.27      651.79
## 6      ffmc              0      0.00   91.42   88.78       99.00
## 7       dmc              0      0.00   83.80   90.50      459.78
## 8        dc              0      0.00  528.37  513.39     1122.11
## 9       isi             75      0.00    8.52    8.33      137.70
## 10      bui             75      0.00  117.90  120.70      458.66
## 11      fwi             75      0.00   29.73   28.47      183.90
## 12      ros            631   -429.82    4.27    6.53       96.23
## 13      sfc            631      0.00    3.09    2.69        4.99
## 14      tfc            631      0.00    3.19    3.13        9.95
## 15      bfc         749567      0.00    4.13  298.72 45488500.00
## 16      hfi            631 -91845.00 3665.00 8561.24    93142.00
## 17      cfb          55679      0.00    0.00   33.65      100.00
## 18      age        1378187      0.00    0.00  552.53     6742.00
## 19  estarea        1249981      0.00    8.46    6.48       20.92
## 20  pcuring         695143     -1.00   36.00   38.02      125.00
## 21  cfactor        1034202     -1.00    0.04    0.13        1.00
## 22  greenup         695158     -1.00    1.00    0.77     1527.00
## 23     elev          40851     -1.00  981.00 1006.76     3129.00
## 24      cfl         374667     -1.00    0.97    1.14        8.20
## 25     tfc0         377010      0.00    3.19    2.95        6.04
## 26      sfl         746145     -1.00    7.06    9.02       37.35
## 27  ecozone         742841      4.00   14.00   10.51       14.00
## 28     sfc0         742862      0.00    2.93    2.53        4.95
## 29      cbh        1442811     -1.00    8.44    6.99       17.79

Statistical summary of numerical variables in peak period from the hotspots dataset

# Describe numerical columns for hotspots_peak
summary_hotspots_peak <- describe_numerical(hotspots_peak, numerical_columns)

# Print the summary table
print(summary_hotspots_peak)
##    Variable Missing_Values   Min  Median    Mean         Max
## 1      temp              0 -9.66   22.24   21.48       43.88
## 2        rh              0  7.00   33.00   35.50      100.00
## 3        ws              0  0.00    8.95    9.53       59.30
## 4        wd              0  0.00  213.00  202.23      360.00
## 5       pcp              0  0.00    0.00    0.23      651.79
## 6      ffmc              0  0.30   91.50   89.51       99.00
## 7       dmc              0  0.00   85.11   92.50      459.78
## 8        dc              0  0.00  531.53  518.46     1122.11
## 9       isi              0  0.00    8.62    8.49      137.70
## 10      bui              0  0.00  119.03  123.29      458.66
## 11      fwi              0  0.00   30.06   29.10      183.90
## 12      ros              0  0.00    4.44    6.68       96.23
## 13      sfc              0  0.00    3.12    2.75        4.99
## 14      tfc              0  0.00    3.23    3.20        9.95
## 15      bfc         734414  0.00    4.15  138.96 45488500.00
## 16      hfi              0  0.00 3867.00 8761.14    93142.00
## 17      cfb          44273  0.00    0.00   34.23      100.00
## 18      age        1350060  0.00    0.00  557.06     5971.00
## 19  estarea        1223385  0.00    8.46    6.62       20.92
## 20  pcuring         687686  0.00   36.00   37.37      125.00
## 21  cfactor        1007287  0.00    0.04    0.13        1.00
## 22  greenup         687686 -1.00    1.00    0.61        1.00
## 23     elev          36148 -1.00  979.00 1005.88     3129.00
## 24      cfl         361971 -1.00    0.96    1.14        8.20
## 25     tfc0         363838  0.00    3.22    3.01        6.04
## 26      sfl         727326 -1.00    7.11    9.11       37.35
## 27  ecozone         727354  4.00   14.00   10.43       14.00
## 28     sfc0         727326  0.00    2.96    2.59        4.95
## 29      cbh        1415153 -1.00    8.38    6.92       17.55

The statistical summary of the hotspots_peak dataset shows that there are no missing values in key variables, used by Canadian Forest Fire Weather Index (FWI) System, therefore we can continue with visual analysis and provide detailed description.

Summarize table with mean values for plots.

# Create a table with mean values for plots

# Monthly averages for numerical variables
monthly_avg <- hotspots_peak %>%
  group_by(year, month) %>%
  summarise(avg_temp = mean(temp, na.rm = TRUE),
            avg_rh = mean(rh, na.rm = TRUE),
            avg_ws = mean(ws, na.rm = TRUE),
            avg_pcp = mean(pcp, na.rm = TRUE),
            avg_ffmc = mean(ffmc, na.rm = TRUE),
            avg_dmc = mean(dmc, na.rm = TRUE),
            avg_dc = mean(dc, na.rm = TRUE),
            avg_isi = mean(isi, na.rm = TRUE),
            avg_sfc = mean(sfc, na.rm = TRUE),
            avg_tfc = mean(tfc, na.rm = TRUE),
            avg_bfc = mean(bfc, na.rm = TRUE),
            avg_hfi = mean(hfi, na.rm = TRUE),
            avg_bui = mean(bui, na.rm = TRUE),
            avg_fwi = mean(fwi, na.rm = TRUE),
            avg_ros = mean(ros, na.rm = TRUE),
            .groups = 'drop') 

print(monthly_avg)
## # A tibble: 60 × 17
##     year month avg_temp avg_rh avg_ws avg_pcp avg_ffmc avg_dmc avg_dc avg_isi
##    <dbl> <ord>    <dbl>  <dbl>  <dbl>   <dbl>    <dbl>   <dbl>  <dbl>   <dbl>
##  1  2014 May      16.0    29.8   9.97  1.00       87.4   28.8    79.1   8.24 
##  2  2014 Jun      20.4    27.8   9.62  0.618      89.8   58.6   177.    9.55 
##  3  2014 Jul      26.9    24.4   9.80  0.0342     94.3   83.5   384.   13.8  
##  4  2014 Aug      25.1    31.6   7.86  0.157      91.9   99.7   534.    9.29 
##  5  2014 Sep      21.7    32.7   8.11  0.147      90.4   71.9   672.    8.75 
##  6  2014 Oct       8.93   73.4   8.02  2.38       49.0    7.72  403.    0.707
##  7  2015 May      18.8    30.8  10.5   0.0850     91.6   34.7   102.    9.87 
##  8  2015 Jun      23.3    34.5  10.8   0.284      90.0   62.5   329.    8.33 
##  9  2015 Jul      24.2    34.8  11.0   0.401      91.2   83.5   408.    9.60 
## 10  2015 Aug      22.5    35.1   9.94  0.124      91.5  102.    532.    9.88 
## # ℹ 50 more rows
## # ℹ 7 more variables: avg_sfc <dbl>, avg_tfc <dbl>, avg_bfc <dbl>,
## #   avg_hfi <dbl>, avg_bui <dbl>, avg_fwi <dbl>, avg_ros <dbl>

Temp (Temperature)

# This variable shows temperature in Celsius at the specific location, at the fire event
par(mfrow=c(1,2))

# Histogram to show distribution of temperatures
ggplot(hotspots, aes(x = temp)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Temperature at Fire Hotspots",
       x = "Temperature (°C)",
       y = "Frequency") +
  scale_y_continuous(labels = comma) +  # Adjust y-axis labels to numeric format (scales package)
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))

# Histogram to show distribution of temp for hotspots_peak
ggplot(hotspots_peak, aes(x = temp)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Temperature at Fire Hotspots (Peak)",
       x = "Temperature (°C)",
       y = "Frequency") +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))

Average temperature by year

# Creat a summary table with mean temperature by year
mean_temp_by_year <- hotspots_peak %>% 
  group_by(year) %>% 
  summarise(mean_temp = mean(temp, na.rm = TRUE))

# Create a line plot to show temperature trends over years
ggplot(mean_temp_by_year, aes(x = year, y = mean_temp)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(color = "black", size = 2) +
  labs(title = "Average Temperature Over Years (Peak)",
       x = "Year",
       y = "Average Temperature (°C)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Create a line plot for temperature in 2019 for closer look of drop temperature.
ggplot(hotspots_peak %>% filter(year == 2019), aes(x = rep_date, y = temp)) +
  geom_line(color = "steelblue") +
  labs(title = "Temperature Over the Year 2019",
       x = "Date",
       y = "Temperature (°C)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The line plot shows that temperature gradually drop from ~20 degree to 0 degree started from July to October indicated the end of dry season.

Closer look for the temperature comparison from 4 major different years with most case of fires. 2014 : started of hotspot dataset 2018 : the historical year of wildfire in BC 2020 : the global pandemic year where not much fires recorded 2023 : the largest wildfire events.

# Define common axis limits
y_limits <- range(hotspots_peak$temp[hotspots_peak$year %in% c(2014, 2018, 2020, 2023)])

# Create plots with common axis limits
plot_2014 <- ggplot(hotspots_peak %>% filter(year == 2014), aes(x = rep_date, y = temp)) +
  geom_line(color = "steelblue") +
  labs(title = "Temperature Over the Peak Season 2014",
       x = "Date",
       y = "Temperature (°C)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = y_limits)

plot_2018 <- ggplot(hotspots_peak %>% filter(year == 2018), aes(x = rep_date, y = temp)) +
  geom_line(color = "steelblue") +
  labs(title = "Temperature Over the Peak Season 2018",
       x = "Date",
       y = "Temperature (°C)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = y_limits)

plot_2020 <- ggplot(hotspots_peak %>% filter(year == 2020), aes(x = rep_date, y = temp)) +
  geom_line(color = "steelblue") +
  labs(title = "Temperature Over the Peak Season 2020",
       x = "Date",
       y = "Temperature (°C)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = y_limits)

plot_2023 <- ggplot(hotspots_peak %>% filter(year == 2023), aes(x = rep_date, y = temp)) +
  geom_line(color = "steelblue") +
  labs(title = "Temperature Over the Peak Season 2023",
       x = "Date",
       y = "Temperature (°C)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = y_limits)

# Create Line Plots for 4 different years to compare temp
grid.arrange(plot_2014, plot_2018, plot_2020, plot_2023, ncol = 2)

# Trend on temperature drop in October from different years, showing consistent trend and considered common because its beginning of Fall season in BC

RH (Relative Humidity)

# The amount of moisture in the air as a percentage
# 0 to 100 with a mean 36

# Create Line Plots for 4 different years to compare RH

# Define common axis limits
y_limits <- range(hotspots_peak$rh[hotspots_peak$year %in% c(2014, 2018, 2020, 2023)])
# Create plots with common axis limits
plot_2014 <- ggplot(hotspots_peak %>% filter(year == 2014), aes(x = rep_date, y = rh)) +
  geom_line(color = "steelblue") +
  labs(title = "Humidity Over the Peak Season 2014",
       x = "Date",
       y = "Relative Humidity (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = y_limits)

plot_2018 <- ggplot(hotspots_peak %>% filter(year == 2018), aes(x = rep_date, y = rh)) +
  geom_line(color = "steelblue") +
  labs(title = "Humidity Over the Peak Season 2018",
       x = "Date",
       y = "Relative Humidity (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = y_limits)

plot_2020 <- ggplot(hotspots_peak %>% filter(year == 2020), aes(x = rep_date, y = rh)) +
  geom_line(color = "steelblue") +
  labs(title = "Humidity Over the Peak Season 2020",
       x = "Date",
       y = "Relative Humidity (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = y_limits)

plot_2023 <- ggplot(hotspots_peak %>% filter(year == 2023), aes(x = rep_date, y = rh)) +
  geom_line(color = "steelblue") +
  labs(title = "Humidity Over the Peak Season 2023",
       x = "Date",
       y = "Relative Humidity (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = y_limits)

# Arrange the plots side by side
grid.arrange(plot_2014, plot_2018, plot_2020, plot_2023, ncol = 2)

# The variability in relative humidity makes it difficult to establish a consistent trend. 

Comparison between temperature and humidity (RH) in monthly perspective

# Plot monthly temperature averages
temp_plot <- ggplot(monthly_avg, aes(x = month, y = avg_temp, color = factor(year), group = year)) +
  geom_line() +
  labs(title = "Average Temperature by Month",
       x = "Month",
       y = "Average Temperature (°C)",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


# Plot monthly humidity averages
humidity_plot <- ggplot(monthly_avg, aes(x = month, y = avg_rh, color = factor(year), group = year)) +
  geom_line() +
  labs(title = "Average Humidity by Month",
       x = "Month",
       y = "Average Humidity (%)",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

grid.arrange(temp_plot, humidity_plot, ncol = 1)

Comparison between temperature and humidity (RH) from 4 different years.

# Filter for specific years
monthly_avg_filtered <- monthly_avg %>% 
  filter(year %in% c(2014, 2018, 2020, 2023))

# Plot monthly temperature averages
temp_plot_filtered <- ggplot(monthly_avg_filtered, aes(x = month, y = avg_temp, color = factor(year), group = year)) +
  geom_line(size = 1) +
  labs(title = "Average Temperature by Month",
       x = "Month",
       y = "Average Temperature (°C)",
       color = "Year") +
  scale_color_manual(values = c("2014" = "#F8766D", "2018" = "#00C19F", "2020" = "#619CFF", "2023" = "#FF61C3")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot monthly humidity averages
humidity_plot_filtered <- ggplot(monthly_avg_filtered, aes(x = month, y = avg_rh, color = factor(year), group = year)) +
  geom_line(size = 1) +
  labs(title = "Average Humidity by Month",
       x = "Month",
       y = "Average Humidity (%)",
       color = "Year") +
  scale_color_manual(values = c("2014" = "#F8766D", "2018" = "#00C19F", "2020" = "#619CFF", "2023" = "#FF61C3")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Arrange the plots side by side
grid.arrange(temp_plot_filtered, humidity_plot_filtered, ncol = 1)

There is a combination of high temperatures and low humidity during the peak fire months (June to September). 2018 and 2023 had higher temperatures and lower humidity, meaning more fires. When humidity rises sharply in October, fire activity goes down,indicating the end of the peak fire season.

Wind speed (in km/h)

# 0 to 59 with mean of 9
# Higher wind speeds in peak fire months can make fire more intense and make it spread faster.

# Common wind speed scale used is the Beaufort scale (in m/s)
# Convert wind speed from km/h to m/s (t) - to plot wd later
hotspots_peak$ws <- hotspots_peak$ws * 0.27778

# Plot monthly wind speed averages with custom colors
ws_plot_filtered <- ggplot(monthly_avg_filtered, aes(x = month, y = avg_ws, color = factor(year), group = year)) +
  geom_line(size = 1) +
  labs(title = "Average Wind Speed by Month",
       x = "Month",
       y = "Average Wind Speed (m/s)",
       color = "Year") +
  scale_color_manual(values = c("2014" = "#F8766D", "2018" = "#00C19F", "2020" = "#619CFF", "2023" = "#FF61C3")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Arrange the plots side by side
grid.arrange(temp_plot_filtered, humidity_plot_filtered, ws_plot_filtered, ncol = 1)

Wind speed shows significant variability

“wd” (Wind direction)

# Wind exploration with openair() package.
# Visualise with Wind Rose to see most common wind direction
library(openair)

windRose(mydata = hotspots_peak, ws = "ws", wd = "wd", 
         main = "Wind Rose", paddle = FALSE)

# Most of the wind comes from the west (W) and southwest (SW). 
# Regions east and north of areas with strong western, southern and southwestern winds
# should be careful, as these winds can quickly spread fires.
# Wind patterns have are important for wildfire management.

Most of the wind comes from the west (W) and southwest (SW).Regions east and north of areas with strong western, southern and southwestern wind should be careful, as these winds can quickly spread fires.

Wind patterns have are important for wildfire management. Higher wind speeds in peak fire months can make fire more intense and make it spread faster.Wind speed shows significant variability.

“pcp” (Precipitation)

# 0 to 651 with mean of 0.2
# So the amount is very low generally

# Plot monthly precipitation averages
ggplot(monthly_avg, aes(x = month, y = avg_pcp, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Dashed line for trend
  labs(title = "Average Precipitation by Month",
       x = "Month",
       y = "Average Precipitation (mm)",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The plot indicates a clear seasonal variation in precipitation, with a noticeable peak in July for most years.

Fire Weather Index System indices

FFMC (Fine Fuel Moisture Code)

# A numeric rating of the moisture content of litter and other cured fine fuels.
# This code is an indicator of the relative ease of ignition and the flammability of fine fuel.

# The FFMC scale ranges from 0 to 101.
# 0-30: Very wet conditions; ignition is difficult.
# 30-70: Damp conditions; moderate difficulty for ignition.
# 70-85: Dry conditions; fuels are easily ignitable.
# 85-101: Very dry conditions; fuels are highly ignitable and fires can spread rapidly.

# Plot histogram for FFMC
ggplot(hotspots_peak, aes(x = ffmc)) +
  geom_histogram(binwidth = 2, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of FFMC Values", x = "FFMC", y = "Frequency") +
  scale_y_continuous(labels = comma) + 
  theme_minimal()

The histogram reveals that most FFMC values from 2014 to 2023 are between 85 and 99,confirming consistently dry conditions.

# Plot boxplot for FFMC
ggplot(hotspots_peak, aes(x = factor(year), y = ffmc)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "FFMC Distribution Across Years",
       x = "Year",
       y = "FFMC") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The boxplots show that most years have high FFMC values above 90,meaning conditions are very dry and prone to fires. Year of 2020 and 2019 are 2 years of less drought conditions for fine fuels

# Line plot for FFMC over time
ggplot(monthly_avg, aes(x = month, y = avg_ffmc, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Dashed line for trend
  labs(title = "FFMC by Month",
       x = "Month",
       y = "FFMC",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The line plot shows that FFMC values peak every year during the fire season,indicating drier conditions that make it easier for fires to start.

DMC (Duff Moisture Code)

# A numeric rating of the average moisture content
# of loosely compacted organic layers of moderate depth. 
# This code gives an indication of fuel consumption in moderate duff layers 
# and medium-size woody material.

# Values below 20: Low fire danger, duff layers are wet, and ignition is unlikely.
# Values between 21 and 40: Moderate fire danger, duff layers start drying.
# Values between 41 and 100: High fire danger, the duff layer is dry, prone to ignition.
# Values above 100: Extreme fire danger, the duff layer is very dry, and ignition is very likely with the potential for intense fires.

# Plot histogram for DMC
ggplot(hotspots_peak, aes(x = dmc)) +
  geom_histogram(binwidth = 2, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of DMC Values", x = "DMC", y = "Frequency") +
  scale_y_continuous(labels = comma) + 
  theme_minimal()

The histogram shows that most DMC values range between 0 and 100,with a peak around 50-100. Values above 200 are less common but indicate long dry periods.

# Plot boxplot for DMC
ggplot(hotspots_peak, aes(x = factor(year), y = dmc)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "DMC Distribution Across Years",
       x = "Year",
       y = "DMC") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The boxplot for each year reveals that 2017 and 2018 had particularly high outliers,indicating extremely dry conditions that could lead to fires.

On the other hand, 2016 and 2019 had lower DMC values, showing less severe drought.

# Line plot for DMC over time
ggplot(monthly_avg, aes(x = month, y = avg_dmc, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Dashed line for trend
  labs(title = "DMC by Month",
       x = "Month",
       y = "DMC",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The line plot indicates that DMC values peak in late summer (July-August) and decrease in autumn.

This shows that mid to late summer is typically drier,the risk of fire ignition and spread is high during these months.

DC (Drought Code)

# A numeric rating of the average moisture content of deep, compact organic layers.
# This code is a useful indicator of seasonal drought effects on forest fuels
# and the amount of smoldering in deep duff layers and large logs.

# 0-100: Indicates wet conditions with low fire potential.
# 100-300: Moderate drought conditions with increasing fire potential.
# 300-500: High drought conditions, leading to high fire risk.
# 500+: Indicates extreme drought, posing a very high fire risk and potential for intense, prolonged burning.

# Plot histogram for DC
ggplot(hotspots_peak, aes(x = dc)) +
  geom_histogram(binwidth = 20, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of DC Values", x = "DC", y = "Frequency") +
  scale_y_continuous(labels = comma) + 
  theme_minimal()

The histogram of DC values from 2014 to 2023 shows most values are between 300 and 600,peaking around 500. This means BC often has conditions that can lead to deep-burning fires.

# Plot boxplot for DC
ggplot(hotspots_peak, aes(x = factor(year), y = dc)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "DC Distribution Across Years",
       x = "Year",
       y = "DC") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

2017 and 2018: These years have higher median and upper whisker values, meaning very dry conditions, likely leading to severe fire seasons.

2016 and 2019: These years show lower DC values, suggesting wetter conditions and potentially less severe fire activity.

# Line plot for DC over time
ggplot(monthly_avg, aes(x = month, y = avg_dc, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Dashed line for trend
  labs(title = "DC by Month",
       x = "Month",
       y = "DC",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The line plot of DC values by month shows a clear seasonal trend, DC values increase during the summer months (July to August) and decrease in the autumn. This occurs at the time of the peak fire season. There are a lot of high DC values, more than 500, it shows conditions for deep-burning fires.

Visualization between Fire Indices with fire counts (FFMC, DMC, DC)

# Plot FFMC and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
  geom_line(aes(y = avg_ffmc, color = "FFMC"), size = 1) +
  geom_bar(aes(y = norm_n_events * max_ffmc, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
  scale_y_continuous(
    name = "FFMC",
    sec.axis = sec_axis(~ . * 1000 / max_ffmc * max(monthly_data$n_events) / 1000, 
                        name = "Number of Fire Occurrences", labels = comma)) +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  labs(title = "Monthly Trends of FFMC and Fire Occurrences",
       x = "Year",
       color = "Index",
       fill = "Index") +
  scale_color_manual(values = c("FFMC" = "lightblue")) +
  scale_fill_manual(values = c("Fire Occurrences" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot DMC and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
  geom_line(aes(y = avg_dmc, color = "DMC"), size = 1) +
  geom_bar(aes(y = norm_n_events * max_dmc, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
  scale_y_continuous(
    name = "DMC",
    sec.axis = sec_axis(~ . * 1000 / max_dmc * max(monthly_data$n_events) / 1000, 
                        name = "Number of Fire Occurrences", labels = comma)) +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  labs(title = "Monthly Trends of DMC and Fire Occurrences",
       x = "Year",
       color = "Index",
       fill = "Index") +
  scale_color_manual(values = c("DMC" = "lightblue")) +
  scale_fill_manual(values = c("Fire Occurrences" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot DC and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
  geom_line(aes(y = avg_dc, color = "DC"), size = 1) +
  geom_bar(aes(y = norm_n_events * max_dc, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
  scale_y_continuous(
    name = "DC",
    sec.axis = sec_axis(~ . * 1000 / max_dc * max(monthly_data$n_events) / 1000, 
                        name = "Number of Fire Occurrences", labels = comma)) +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  labs(title = "Monthly Trends of DC and Fire Occurrences",
       x = "Year",
       color = "Index",
       fill = "Index") +
  scale_color_manual(values = c("DC" = "lightblue")) +
  scale_fill_manual(values = c("Fire Occurrences" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The plots show a clear relationship between the fire indices (FFMC, DMC, and DC) and the number of fires. When the number of fires is high, the indices also show higher values.

ISI (Initial Spread Index)

# A numerical rating of the expected rate of fire spread
# based on wind speed, temperature, and fine fuel moisture content. 
# ISI is crucial for understanding how quickly a fire can spread once it has ignited.

# 0-3: Low spread potential. Fires will spread slowly and are relatively easy to control.
# 4-7: Moderate spread potential. Fires spread more quickly and may require more effort to control.
# 8-12: High spread potential. Fires spread rapidly and can be difficult to control.
# 13-19: Very high spread potential. Fires spread very rapidly and are challenging to control.
# 20+: Extreme spread potential. Fires spread uncontrollably and can be extremely dangerous.

# Plot histogram for ISI
ggplot(hotspots_peak, aes(x = isi)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of ISI Values", x = "ISI", y = "Frequency") +
  scale_y_continuous(labels = scales::comma) + 
  theme_minimal()

The histogram indicates that most ISI values are low to moderate.

# Plot boxplot for ISI
ggplot(hotspots_peak, aes(x = factor(year), y = isi)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "ISI Distribution Across Years",
       x = "Year",
       y = "ISI") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The boxplot shows variability in ISI values across years, with notable outliers in 2014 and 2020.

# Line plot for ISI over time
ggplot(monthly_avg, aes(x = month, y = avg_isi, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Smoothed trend line
  labs(title = "ISI by Month",
       x = "Month",
       y = "ISI",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The graphs illustrate that ISI values tend to peak during the summer months,indicating higher fire spread potential during this period.

Analyze extreme outlier of ISI

# To improve the clarity of these insights,
# removing NA values and extreme outliers from the dataset is necessary.
# Remove extreme outliers
hotspots_peak_filtered <- hotspots_peak%>%
  filter(isi < 60)
dim(hotspots_peak_filtered)
## [1] 1734602      42
# Filter out entries with ISI values greater than 60
event_outliers <- hotspots_peak %>%
  filter(isi >= 60)

# Check single event
dim(event_outliers)
## [1] 11 42
view(event_outliers)

Sample of one specific event with FWI index

# The data from July 19, 2014, shows very high ISI values at a specific location in British Columbia.
# This matches the date of the Mount McAllister fire, a large wildfire that burned over 20,000 hectares.
# The weather that day included high temperatures, low humidity, and strong winds, which made the fire spread quickly.
# These conditions explain the high ISI values, indicating a high potential for severe fire behavior.
# The data points from July 19, 2014, are all in the same cluster, showing that the clustering algorithm grouped them correctly.


# Filter the specific event cluster and near events
event_McAllister <- hotspots %>%
  filter(event_cluster %in% 370:372) 

# Calculate monthly averages for July 2014
monthly_avg_july <- monthly_avg %>%
  filter(year == 2014 & month == "Jul")

# Plot the weather conditions with average lines
ggplot(event_McAllister, aes(x = rep_date)) +
  geom_point(aes(y = temp, color = "Temperature"), size = 3) +
  geom_point(aes(y = rh, color = "Relative Humidity"), size = 3) +
  geom_point(aes(y = ws, color = "Wind Speed"), size = 3) +
  geom_hline(aes(yintercept = monthly_avg_july$avg_temp, color = "Avg Temperature"), linetype = "dashed") +
  geom_hline(aes(yintercept = monthly_avg_july$avg_rh, color = "Avg Relative Humidity"), linetype = "dashed") +
  geom_hline(aes(yintercept = monthly_avg_july$avg_ws, color = "Avg Wind Speed"), linetype = "dashed") +
  labs(title = "Weather Conditions on July 19, 2014 with Monthly Averages",
       x = "Date",
       y = "Value",
       color = "Variable") +
  scale_color_manual(values = c("Temperature" = "red3", "Relative Humidity" = "steelblue2", "Wind Speed" = "green",
                                "Avg Temperature" = "red3", "Avg Relative Humidity" = "steelblue2", "Avg Wind Speed" = "green")) +
  theme_minimal()

# Create a summary table of the weather conditions
weather_summary <- event_McAllister %>%
  select(lat, lon, rep_date, temp, rh, ws, wd, pcp, ffmc, dmc, dc, isi, bui, fwi, ros)

# Print the summary table
print(weather_summary)
## # A tibble: 110 × 15
##      lat   lon rep_date             temp    rh    ws    wd   pcp  ffmc   dmc
##    <dbl> <dbl> <dttm>              <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  53.2 -126. 2014-08-08 04:21:00  14.3    60  10.2     0   0.2  87.9  119.
##  2  50.3 -122. 2014-07-19 21:24:00  28.4    28  22.6   206   0    93.3  201 
##  3  53.3 -125. 2014-07-16 20:45:00  27.3    23   6.6   249   0    95.1   86 
##  4  53.3 -125. 2014-07-16 20:49:00  26.6    22   7.4    22   0    95.1   91 
##  5  50.3 -122. 2014-07-19 21:24:00  27.5    33  59.3   190   0    93.4  237.
##  6  53.2 -125. 2014-08-08 04:21:00  14.3    60  10.2     0   0.2  87.9  119.
##  7  53.3 -126. 2014-07-16 20:49:00  26.6    22   7.4    22   0    95.1   91 
##  8  53.4 -126. 2014-07-16 20:45:00  26.6    22   7.4    22   0    95.1   91 
##  9  53.3 -125. 2014-07-16 20:49:00  27.3    23   6.6   249   0    95.1   86 
## 10  53.3 -126. 2014-07-16 20:49:00  26.6    22   7.4    22   0    95.1   91 
## # ℹ 100 more rows
## # ℹ 5 more variables: dc <dbl>, isi <dbl>, bui <dbl>, fwi <dbl>, ros <dbl>

This graph shows the weather conditions at McAllister Creek on July 19, 2014,including temperature, relative humidity, and wind speed. It compares these values to the average conditions for July 2014.

The wind speed during this event is notably higher than the monthly average,suggesting stronger winds that could help spread the fire.

Visualization of ISI after removing extreme value

# Plot histogram for ISI
ggplot(hotspots_peak, aes(x = isi)) +
  geom_histogram(binwidth = 2, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of ISI Values", x = "ISI", y = "Frequency") +
  scale_y_continuous(labels = scales::comma) + 
  theme_minimal()

The histogram now shows a more typical range of ISI values, removing the extreme values that distorted the original plot.

# Plot boxplot for ISI
ggplot(hotspots_peak_filtered, aes(x = factor(year), y = isi)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "ISI Distribution Across Years",
       x = "Year",
       y = "ISI") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The boxplot shows a clearer, more consistent ISI distribution across the years after removing high outliers.

# Line plot for ISI over time
ggplot(monthly_avg, aes(x = month, y = avg_isi, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Smoothed trend line
  labs(title = "ISI by Month",
       x = "Month",
       y = "ISI",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The seasonal pattern plot still peaks in mid-summer but appears smoother without the extreme values.

Correlation of ISI and Weather Indices

# Using monthly averages instead of individual observations for quicker plotting

ggplot(monthly_avg, aes(x = avg_ws, y = avg_isi)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Average ISI vs Average Wind Speed", x = "Average Wind Speed (km/h)", y = "Average ISI") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Average ISI vs. Average Wind Speed: Shows that higher wind speeds are linked to higher ISI values.

ggplot(monthly_avg, aes(x = avg_temp, y = avg_isi)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Average ISI vs Average Temperature", x = "Average Temperature (°C)", y = "Average ISI") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Average ISI vs. Average Temperature: Shows that higher temperatures are associated with higher ISI values.

ggplot(monthly_avg, aes(x = avg_rh, y = avg_isi)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "green") +
  labs(title = "Average ISI vs Average Relative Humidity", x = "Average Relative Humidity (%)", y = "Average ISI") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Average ISI vs. Average Relative Humidity: Shows that as relative humidity increases, ISI decreases.

Correlation matrix between ISI and weather variables (WS, temp, RH)

# Select columns 
data_corr <- hotspots_peak %>%
  select(isi, ws, temp, rh)

# Calculate the correlation matrix
corr_matrix <- cor(data_corr)

# Plot the correlation matrix
corrplot(corr_matrix, method = "circle", type = "lower",
         tl.col = "black", tl.srt = 45, title = "Correlation Matrix of ISI and Weather Variables",
         mar = c(0, 0, 1, 0))

Strong negative correlation between ISI and relative humidity. Strong positive correlation between ISI and temperature. Moderate positive correlation between ISI and wind speed.

BUI (Buildup Index)

# A numerical rating of the total amount of fuel available for combustion.
# It is derived from the Duff Moisture Code (DMC) and the Drought Code (DC)

# Low: 0-40
# Moderate: 41-80
# High: 81-120
# Extreme: 121 and above

ggplot(hotspots_peak, aes(x = bui)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of BUI Values", x = "BUI", y = "Frequency") +
  scale_y_continuous(labels = scales::comma) + 
  theme_minimal()

The histogram shows that BUI values mostly range from 60 to 100. This means the fire potential is moderate to high for most of the dataset.

ggplot(hotspots_peak, aes(x = factor(year), y = bui)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "BUI Distribution Across Years",
       x = "Year",
       y = "BUI") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

Boxplots show BUI values vary year to year. Some years have higher values, which means drier conditions and a higher chance of intense fires.

ggplot(monthly_avg, aes(x = month, y = avg_bui, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Smoothed trend line
  labs(title = "BUI by Month",
       x = "Month",
       y = "BUI",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

BUI peaks in mid-summer, which matches the time when fire activity is usually the highest.

# Scatter plots to visualize relationships using average values
ggplot(monthly_avg, aes(x = avg_dmc, y = avg_bui)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Average BUI vs Average DMC", x = "Average DMC", y = "Average BUI") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

BUI and DC also show a positive correlation,though it is slightly weaker.

Correlation matrix of BUI, DMC and FC

# Select columns
data_corr <- hotspots_peak %>%
  select(bui, dmc, dc)

# Calculate the correlation matrix
corr_matrix <- cor(data_corr)

# Plot the correlation matrix
corrplot(corr_matrix, method = "circle", type = "lower",
         tl.col = "black", tl.srt = 45, title = "Correlation Matrix of BUI, DMC, and DC",
         mar = c(0, 0, 1, 0))

BUI, DMC, and DC are all indicators of fire potential. High values in these indices suggest drier conditions, which can lead to higher fire.

FWI (Fire Weather Index)

# A numeric rating of fire intensity. It is based on the ISI and the BUI, 
# and is used as a general index of fire danger throughout the forested areas of Canada.

# Low (0-5): Minimal fire danger.
# Moderate (6-12): Fires can start from most accidental causes, but the spread is slow.
# High (13-20): Fires can start easily and spread rapidly.
# Very High (21-30): Fires will start very easily, spread rapidly, and burn intensely.
# Extreme (31+): Fires start and spread quickly, and are intense and challenging to control.

ggplot(hotspots_peak, aes(x = fwi)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of FWI Values", x = "FWI", y = "Frequency") +
  scale_y_continuous(labels = scales::comma) + 
  theme_minimal()

The histogram shows that most FWI values are between 10 and 30, fire danger is from moderate to very high. Many values are above 30, severe fire weather conditions are frequent.

ggplot(hotspots_peak, aes(x = factor(year), y = fwi)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "FWI Distribution Across Years",
       x = "Year",
       y = "FWI") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The boxplot shows the distribution of FWI values for different years.The line plot shows that FWI peaks in mid-summer, peak fire activity period.The data shows significant variability in fire danger, summer months and certain years have extreme outliers.

ggplot(monthly_avg, aes(x = month, y = avg_fwi, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Smoothed trend line
  labs(title = "FWI by Month",
       x = "Month",
       y = "FWI",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The line plot shows that FWI peaks in mid-summer, peak fire activity period. The data shows significant variability in fire danger, summer months and certain years have extreme outliers.

Fuel (Fuel Type)

# D1: Deciduous trees (leafless, early spring to fall)
# C2, C3, C4, C5, C7: Various types of coniferous trees
# O1, O1a, O1b: Grass or herbaceous vegetation
# M1, M1M2, M2, M2_25, M2_35, M2_50, M2_65: Mixedwood or transitional vegetation
# bog: Wetland areas with peatland vegetation
# water: Water bodies
# urban: Urban areas
# non_fuel: Areas with no significant vegetation (rock, gravel)
# low_veg: Areas with low vegetation (possibly recently burned or cleared)
# farm: Agricultural areas
# D2: Deadfall or downed wood

# Group by fuel and summarize
fuel_counts <- hotspots_peak %>%
  group_by(fuel) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) 

# Print the fuel counts
print(fuel_counts)
## # A tibble: 27 × 2
##    fuel   count
##    <chr>  <int>
##  1 C2    566652
##  2 C3    383781
##  3 D1    187812
##  4 C7    149801
##  5 C5     71006
##  6 D2     68185
##  7 M2_65  60189
##  8 C1     43814
##  9 M2_50  41250
## 10 O1b    37656
## # ℹ 17 more rows
# Create a bar plot for the fuel column
ggplot(hotspots_peak, aes(x = fuel)) +
  geom_bar(fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Fuel Types", x = "Fuel Type", y = "Count") +
  theme_minimal() +
  scale_y_continuous(labels = scales::comma) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The dataset shows a variety of fuel types, with “C2” (566,651 records) and “C3” (383,781 records) being the most common. Other significant fuel types include “D1” (187,803 records) and “C7” (149,801 records).

“ros” (Rate of Spread)

# The predicted speed of the fire at the front or head of the fire (where the fire moves fastest),and takes into account both crowning and spotting. 
# The scale ranges from 0 to over 20 m/min in the dataset.

ggplot(hotspots_peak, aes(x = ros)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution Rate of Spread at Fire Hotspots",
       x = "ROS (m/min)",
       y = "Frequency") +
  scale_y_continuous(labels = comma) + 
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))

ggplot(hotspots_peak, aes(x = factor(year), y = ros)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Rate of Spread Distribution Across Years",
       x = "Year",
       y = "ROS (m/min)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The plot shows that the ROS can vary significantly year to year, with some years experiencing more extreme fire spread conditions.

# Scatter plot for average ROS vs average ISI
ggplot(monthly_avg, aes(x = avg_isi, y = avg_ros)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Average ROS vs Average ISI",
       x = "Average ISI",
       y = "Average ROS") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot for average ROS vs average BUI
ggplot(monthly_avg, aes(x = avg_bui, y = avg_ros)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Average ROS vs Average BUI",
       x = "Average BUI",
       y = "Average ROS") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

 # Scatter plot for average ROS vs average FWI
ggplot(monthly_avg, aes(x = avg_fwi, y = avg_ros)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "green") +
  labs(title = "Average ROS vs Average FWI",
       x = "Average FWI",
       y = "Average ROS") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

The scatter plots for ROS vs FWI, ROS vs BUI, and ROS vs ISI all show a clear upward trend.This means that as the values of FWI, BUI, and ISI increase, the Rate of Spread also increases. Higher values indicate more severe fire conditions,more available fuel, and faster initial fire spread. This means the fire spreads faster.

Correlation matrix between ROS, ISI, BUI and FWI

# Calculate correlations between ROS, ISI, BUI, and FWI
correlation_ros_indices <- hotspots_peak %>%
  select(ros, isi, bui, fwi) %>%
  cor(use = "complete.obs")

# Print the correlation matrix
print(correlation_ros_indices)
##           ros       isi       bui       fwi
## ros 1.0000000 0.5868442 0.2678373 0.5416872
## isi 0.5868442 1.0000000 0.4671475 0.9297335
## bui 0.2678373 0.4671475 1.0000000 0.7159426
## fwi 0.5416872 0.9297335 0.7159426 1.0000000
# Plot the correlation matrix
corrplot(correlation_ros_indices, method = "circle", type = "lower",
         tl.col = "black", tl.srt = 45, title = "Correlation Matrix of ROS and Other Indices",
         mar = c(0, 0, 1, 0))

The matrix shows strong positive correlations between ROS and ISI, ROS and FWI,and a moderate positive correlation between ROS and BUI.

“sfc” (Surface Fuel Consumption)

# SFC represents the amount of fuel consumed by surface fires, 
# including organic soil (duff), surface litter, dead and downed woody debris.
# Calculated using indices of the FWI System (BUI and DC).

print(monthly_avg)
## # A tibble: 60 × 17
##     year month avg_temp avg_rh avg_ws avg_pcp avg_ffmc avg_dmc avg_dc avg_isi
##    <dbl> <ord>    <dbl>  <dbl>  <dbl>   <dbl>    <dbl>   <dbl>  <dbl>   <dbl>
##  1  2014 May      16.0    29.8   9.97  1.00       87.4   28.8    79.1   8.24 
##  2  2014 Jun      20.4    27.8   9.62  0.618      89.8   58.6   177.    9.55 
##  3  2014 Jul      26.9    24.4   9.80  0.0342     94.3   83.5   384.   13.8  
##  4  2014 Aug      25.1    31.6   7.86  0.157      91.9   99.7   534.    9.29 
##  5  2014 Sep      21.7    32.7   8.11  0.147      90.4   71.9   672.    8.75 
##  6  2014 Oct       8.93   73.4   8.02  2.38       49.0    7.72  403.    0.707
##  7  2015 May      18.8    30.8  10.5   0.0850     91.6   34.7   102.    9.87 
##  8  2015 Jun      23.3    34.5  10.8   0.284      90.0   62.5   329.    8.33 
##  9  2015 Jul      24.2    34.8  11.0   0.401      91.2   83.5   408.    9.60 
## 10  2015 Aug      22.5    35.1   9.94  0.124      91.5  102.    532.    9.88 
## # ℹ 50 more rows
## # ℹ 7 more variables: avg_sfc <dbl>, avg_tfc <dbl>, avg_bfc <dbl>,
## #   avg_hfi <dbl>, avg_bui <dbl>, avg_fwi <dbl>, avg_ros <dbl>
ggplot(hotspots_peak, aes(x = sfc)) +
  geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Surface Fuel Consumption (SFC)",
       x = "SFC (kg/m²)",
       y = "Frequency") +
  theme_minimal() + 
  scale_y_continuous(labels = comma)

“tfc” (Total Fuel Consumption)

# TFC is the total amount of fuel consumed by both surface and crown fires. 
# It includes all components in SFC plus the consumption of overstory fuels (canopy).
# Estimates the overall carbon emissions from a fire event.
# Range varies widely based on the intensity and spread of the fire, as well as the initial fuel load and moisture content.

ggplot(hotspots_peak, aes(x = tfc)) +
  geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Total Fuel Consumption (TFC)",
       x = "TFC (kg/m²)",
       y = "Frequency") +
  theme_minimal() +
  scale_y_continuous(labels = comma)

“bfc” (Bonfire Fuel Consumption)

# BFC refers to a specific component of the BORFIRE model focused on estimating carbon emissions from boreal forest fires.

summary(hotspots_peak$bfc)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##        0        3        4      139        5 45488500   734414
# Explore huge bfc values
huge_bfc <- hotspots_peak %>%
  filter(bfc > 1000)

# Print the resulting data frame to inspect the entries with huge bfc values

huge_bfc %>%
  group_by(year, month) %>%
  summarise(n_entries = n()) %>%
  arrange(year, month)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   year [2]
##    year month n_entries
##   <dbl> <ord>     <int>
## 1  2020 Aug           3
## 2  2020 Oct          87
## 3  2021 Sep           8
## 4  2021 Oct         102
summary(huge_bfc$bfc)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1004     2103     3335   673558    19825 45488500
# Remove NA values from BFC data
hotspots_peak_BFC <- hotspots_peak %>%
  filter(!is.na(bfc))
# Calculate IQR for BFC
IQR_bfc <- IQR(hotspots_peak_BFC$bfc, na.rm = TRUE)
Q1_bfc <- quantile(hotspots_peak_BFC$bfc, 0.25, na.rm = TRUE)
Q3_bfc <- quantile(hotspots_peak_BFC$bfc, 0.75, na.rm = TRUE)

# Define lower and upper bounds for outliers
upper_bound_bfc <- Q3_bfc + 1.5 * IQR_bfc

# Filter out the outliers
hotspots_peak_BFC <- hotspots_peak_BFC %>%
  filter(bfc <= upper_bound_bfc)

# Summary of filtered data
summary(hotspots_peak_BFC$bfc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.265   4.057   3.743   4.659   7.274
# Plot Histogram of BFC Data Without Handling Outliers
ggplot(hotspots_peak_BFC, aes(x = bfc)) +
  geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Boreal Fire Consumption (BFC)",
       x = "BFC",
       y = "Frequency") +
  theme_minimal() + 
  scale_y_continuous(labels = scales::comma)

“hfi” (Head Fire Intensity)

# Measures the intensity or energy output of a fire at its front (head).
# HFI is measured in kilowatts per metre (kW/m) of the fire front and is calculated based on the Rate of Spread (ROS)
# and the Total Fuel Consumption (TFC).

# Low (0-500 kW/m): Fires are relatively easy to control and generally cause limited damage.
# Moderate (500-2000 kW/m): Fires can be challenging to control, requiring more resources and effort.
# High (2000-4000 kW/m): Fires are intense and difficult to manage, often requiring aerial firefighting resources.
# Very High (4000-10,000 kW/m): Fires are extremely intense and nearly impossible to control, posing significant risk to life and property.
# Extreme (10,000+ kW/m): Fires exhibit explosive behavior and can cause catastrophic damage.

# Plot histogram for HFI
ggplot(hotspots_peak, aes(x = hfi)) +
  geom_histogram(binwidth = 1000, fill = "skyblue", color = "black", alpha = 0.7) + # Each bin represent a range of 1000 HFI units.
  labs(title = "Distribution of HFI Values",
       x = "HFI (kW/m)",
       y = "Frequency") +
  scale_y_continuous(labels = scales::comma) + 
  scale_x_continuous(breaks = seq(0, 75000, 10000)) +
  theme_minimal()

The histogram shows the frequency distribution of HFI values.Most HFI values are concentrated at the lower end of the scale, as values increase there are fewer.

Very high HFI values are outliers they show occasional extremely intense fires. While most fires are less intense, the few high-intensity fires can be significant and impactful.

ggplot(hotspots_peak, aes(x = factor(year), y = hfi)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "HFI Distribution Across Years",
       x = "Year",
       y = "HFI (kW/m)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The boxplots show the distribution of HFI across different years.The median value varies yearly, there are fluctuations in fire intensity.There are significant outliers in almost all years, there are some fires with extremely high intensities.

ggplot(monthly_avg, aes(x = month, y = avg_hfi, color = factor(year), group = year)) +
  geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") +  # raw data
  geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") +  # Smoothed trend line
  labs(title = "HFI by Month",
       x = "Month",
       y = "HFI (kW/m)",
       color = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

Different years show a lot of differences in HFI values. For example, 2014 and 2021 had higher peaks, meaning there was more intense fire activity in those years.

2020 had lower HFI values, there were milder fire conditions or better fire control efforts.The plots also show how HFI can change from month to month within a single fire season.Weather conditions, temperature, humidity, and wind speed, can affect how fires behave.

The HFI index shows notable peaks, particularly in the years 2015, 2018, and 2019, showing periods with high fire intensity.These peaks suggest severe fire conditions during these years.

The number of fire events is notably higher in the years with higher HFI values, such as 2018 and 2019.

“cfb” (Crown Fraction Burned)

# CFB is the predicted fraction of the tree crowns consumed by the fire.
# 0 to 100 with a mean of 33
# It ranges from 0 to 100%, 
# where 0% means no crown fire activity and 100% indicates 
# that the entire tree crowns in the area have been burned.

ggplot(hotspots_peak, aes(x = cfb)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of CFB at Fire Hotspots",
       x = "CFB",
       y = "Frequency") +
  scale_y_continuous(labels = comma) + 
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))
## Warning: Removed 44273 rows containing non-finite outside the scale range
## (`stat_bin()`).

This histogram is dominated by zero values, making it difficult to identify clear trends.

# Count missing values in the CFB column
sum(is.na(hotspots_peak$cfb))
## [1] 44273
# Identify years with the most missing values
missing_cfb <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    missing_count = sum(is.na(cfb)),
    missing_percentage = (missing_count / total_count) * 100
  ) %>%
  arrange(desc(missing_percentage))

missing_cfb
## # A tibble: 10 × 4
##     year total_count missing_count missing_percentage
##    <dbl>       <int>         <int>              <dbl>
##  1  2015       44579         25012               56.1
##  2  2016       11094          5197               46.8
##  3  2014       36152         14064               38.9
##  4  2017      270146             0                0  
##  5  2018      365355             0                0  
##  6  2019       28534             0                0  
##  7  2020       11507             0                0  
##  8  2021      232832             0                0  
##  9  2022       46587             0                0  
## 10  2023      687827             0                0

2015 has the highest percentage of missing values at 56.1%. 2017 has 54.9%. 2016 has 46.8%. 2018-2024 and 2014 have no missing values

zero_cfb <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    zero_count = sum(cfb == 0, na.rm = TRUE),
    zero_percentage = (zero_count / total_count) * 100
  ) %>%
  arrange(desc(zero_percentage))

zero_cfb
## # A tibble: 10 × 4
##     year total_count zero_count zero_percentage
##    <dbl>       <int>      <int>           <dbl>
##  1  2019       28534      26443         92.7   
##  2  2022       46587      38289         82.2   
##  3  2020       11507       8984         78.1   
##  4  2023      687827     446669         64.9   
##  5  2021      232832     131123         56.3   
##  6  2017      270146     131047         48.5   
##  7  2018      365355     112988         30.9   
##  8  2014       36152         17          0.0470
##  9  2015       44579         17          0.0381
## 10  2016       11094          2          0.0180

2019 has the highest percentage of zero values at 92.7%. 2020, 2021, 2022, and 2023 also have high values ranging from 65.4% to 77.8%. 2015-2018, and 2024 have almost no zero values.

Visualization of CFB after removing NA value

# Filter out zero and na values to check the CFB values when it was properly recorded.

hotspots_peak_CFB <- hotspots_peak %>%
  filter(cfb > 0 & !is.na(cfb))

# Create the histogram for CFB
ggplot(hotspots_peak_CFB, aes(x = cfb)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Crown Fraction Burned (CFB) at Fire Hotspots",
       x = "CFB",
       y = "Frequency") +
  theme_minimal() + 
  scale_y_continuous(labels = scales::comma)

“age” (Age of the Tree)

# The age is indicated the age of tree were burned
# Count missing values in the Age column
sum(is.na(hotspots_peak$age))
## [1] 1350060
# Identify years with the most missing values
missing_age <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    missing_count = sum(is.na(age)),
    missing_percentage = (missing_count / total_count) * 100
  ) %>%
  arrange(desc(missing_percentage))

missing_age
## # A tibble: 10 × 4
##     year total_count missing_count missing_percentage
##    <dbl>       <int>         <int>              <dbl>
##  1  2019       28534         28534              100  
##  2  2020       11507         11507              100  
##  3  2021      232832        232832              100  
##  4  2022       46587         46587              100  
##  5  2023      687827        687827              100  
##  6  2018      365355        342773               93.8
##  7  2014       36152             0                0  
##  8  2015       44579             0                0  
##  9  2016       11094             0                0  
## 10  2017      270146             0                0
zero_age <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    zero_count = sum(age == 0, na.rm = TRUE),
    zero_percentage = (zero_count / total_count) * 100
  ) %>%
  arrange(desc(zero_percentage))

zero_age
## # A tibble: 10 × 4
##     year total_count zero_count zero_percentage
##    <dbl>       <int>      <int>           <dbl>
##  1  2017      270146     255509            94.6
##  2  2014       36152          0             0  
##  3  2015       44579          0             0  
##  4  2016       11094          0             0  
##  5  2018      365355          0             0  
##  6  2019       28534          0             0  
##  7  2020       11507          0             0  
##  8  2021      232832          0             0  
##  9  2022       46587          0             0  
## 10  2023      687827          0             0

The years with valid amount of observations are 2014 2015 2016. The variable almost doesnt have meaningfull values in 2017 and 2018. The more recent hotspots have stopped recording this variable altogether.

Visualization of age variable

hotspots_peak_age <- hotspots_peak %>%
  filter(age > 0 & !is.na(age))

# Create the histogram for Age
ggplot(hotspots_peak_age, aes(x = age)) +
  geom_histogram(binwidth = 100, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Tree Age at Fire Hotspots",
       x = "Age",
       y = "Frequency") +
  theme_minimal() + 
  scale_y_continuous(labels = scales::comma)

“est area” (Estimated Area Burned )

# approximate burned area based on historical average area burned per hotspot by agency and fuel type
# Count missing values in the estarea
sum(is.na(hotspots_peak$estarea))
## [1] 1223385
# Identify years with the most missing values

missing_estarea <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    missing_count = sum(is.na(estarea)),
    missing_percentage = (missing_count / total_count) * 100
  ) %>%
  arrange(desc(year))

missing_estarea
## # A tibble: 10 × 4
##     year total_count missing_count missing_percentage
##    <dbl>       <int>         <int>              <dbl>
##  1  2023      687827        687827              100  
##  2  2022       46587         46587              100  
##  3  2021      232832        232832              100  
##  4  2020       11507          9827               85.4
##  5  2019       28534         18305               64.2
##  6  2018      365355        200009               54.7
##  7  2017      270146             0                0  
##  8  2016       11094          5880               53.0
##  9  2015       44579         22118               49.6
## 10  2014       36152             0                0

2023 2022 and 2021 do not have this variable. 2015 2016 2018 2019 has around 50% missing values 2020 85 % missing values

Summary of zero value in estarea variable

zero_estarea <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    zero_count = sum(estarea == 0, na.rm = TRUE),
    zero_percentage = (zero_count / total_count) * 100
  ) %>%
  arrange(desc(year))

zero_estarea
## # A tibble: 10 × 4
##     year total_count zero_count zero_percentage
##    <dbl>       <int>      <int>           <dbl>
##  1  2023      687827          0          0     
##  2  2022       46587          0          0     
##  3  2021      232832          0          0     
##  4  2020       11507          9          0.0782
##  5  2019       28534         58          0.203 
##  6  2018      365355          0          0     
##  7  2017      270146     140895         52.2   
##  8  2016       11094          0          0     
##  9  2015       44579          0          0     
## 10  2014       36152          0          0

Additionally to the missing values already described, 2017 has almost 50% zero values The only year with values present is 2014 The recent hotspots have stopped recording this variable altogether.

“polyid” variable

# Count missing values in the estarea
sum(is.na(hotspots_peak$polyid))
## [1] 1384975
# Identify years with the most missing values
missing_polyid <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    missing_count = sum(is.na(polyid)),
    missing_percentage = (missing_count / total_count) * 100
  ) %>%
  arrange(desc(year))

missing_polyid
## # A tibble: 10 × 4
##     year total_count missing_count missing_percentage
##    <dbl>       <int>         <int>              <dbl>
##  1  2023      687827        687827             100   
##  2  2022       46587         46587             100   
##  3  2021      232832        232832             100   
##  4  2020       11507         11507             100   
##  5  2019       28534         28534             100   
##  6  2018      365355        365355             100   
##  7  2017      270146             0               0   
##  8  2016       11094          3193              28.8 
##  9  2015       44579          7984              17.9 
## 10  2014       36152          1156               3.20

There is no polyid variable in year 2018-2023.

zero_polyid <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    zero_count = sum(polyid == 0, na.rm = TRUE),
    zero_percentage = (zero_count / total_count) * 100
  ) %>%
  arrange(desc(year))

zero_polyid
## # A tibble: 10 × 4
##     year total_count zero_count zero_percentage
##    <dbl>       <int>      <int>           <dbl>
##  1  2023      687827          0             0  
##  2  2022       46587          0             0  
##  3  2021      232832          0             0  
##  4  2020       11507          0             0  
##  5  2019       28534          0             0  
##  6  2018      365355          0             0  
##  7  2017      270146     267596            99.1
##  8  2016       11094          0             0  
##  9  2015       44579          0             0  
## 10  2014       36152          0             0

Additionally 2017 has almost all zero value.

Initially the variable was used to identify specific event, but recently the reporting process has changed Only meaningful information in 2014 2015 and 2016, it may be not suitable for meaningful analyses for the 10 year period

“pcuring” (Percent Curing)

# This indicates the proportion of grass and other fuels (non-woody) that are in a cured (dried) state, ready to burn.
# For example, if pcuring is 80%, it means that 80% of the vegetation is dead or dry enough to ignite and sustain fire.

# Count missing values in the pcuring
sum(is.na(hotspots_peak$pcuring))
## [1] 687686
# Identify years with the most missing values
missing_pcuring <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    missing_count = sum(is.na(pcuring)),
    missing_percentage = (missing_count / total_count) * 100
  ) %>%
  arrange(desc(year))

missing_pcuring
## # A tibble: 10 × 4
##     year total_count missing_count missing_percentage
##    <dbl>       <int>         <int>              <dbl>
##  1  2023      687827        687686               100.
##  2  2022       46587             0                 0 
##  3  2021      232832             0                 0 
##  4  2020       11507             0                 0 
##  5  2019       28534             0                 0 
##  6  2018      365355             0                 0 
##  7  2017      270146             0                 0 
##  8  2016       11094             0                 0 
##  9  2015       44579             0                 0 
## 10  2014       36152             0                 0

2023 does not have this variable.

Analyse pcuring without year of 2023

# Exclude the year 2023 where pcuring is missing
hotspots_peak_pcuring <- hotspots_peak %>%
  filter(!(year == 2023)) %>%
  filter(!is.na(pcuring))

# Check the summary of the filtered data
summary(hotspots_peak_pcuring$pcuring)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   21.00   36.00   37.37   50.00  125.00
# Checking for values greater than 100%
pcuring_above_100 <- hotspots_peak %>% filter(pcuring > 100)
summary(pcuring_above_100)
##       lat             lon            rep_date                  
##  Min.   :50.27   Min.   :-134.4   Min.   :2018-08-01 03:10:00  
##  1st Qu.:55.20   1st Qu.:-132.1   1st Qu.:2018-08-09 02:21:15  
##  Median :57.27   Median :-131.2   Median :2018-08-17 05:12:30  
##  Mean   :56.06   Mean   :-128.5   Mean   :2018-11-02 22:40:00  
##  3rd Qu.:58.12   3rd Qu.:-127.6   3rd Qu.:2018-11-11 01:31:15  
##  Max.   :59.44   Max.   :-117.2   Max.   :2019-07-10 05:05:00  
##                                                                
##       uid             source             sensor           satellite        
##  Min.   :3033797   Length:4           Length:4           Length:4          
##  1st Qu.:4125436   Class :character   Class :character   Class :character  
##  Median :5217074   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :4527720                                                           
##  3rd Qu.:5274682                                                           
##  Max.   :5332290                                                           
##  NA's   :1                                                                 
##     agency               temp             rh              ws       
##  Length:4           Min.   :13.92   Min.   :39.00   Min.   :2.757  
##  Class :character   1st Qu.:15.98   1st Qu.:44.25   1st Qu.:2.812  
##  Mode  :character   Median :18.76   Median :49.00   Median :2.924  
##                     Mean   :18.43   Mean   :48.25   Mean   :3.022  
##                     3rd Qu.:21.21   3rd Qu.:53.00   3rd Qu.:3.134  
##                     Max.   :22.29   Max.   :56.00   Max.   :3.482  
##                                                                    
##        wd             pcp               ffmc            dmc        
##  Min.   :237.0   Min.   :0.00000   Min.   :83.92   Min.   : 32.17  
##  1st Qu.:243.0   1st Qu.:0.03375   1st Qu.:86.59   1st Qu.: 45.81  
##  Median :248.5   Median :0.20450   Median :88.80   Median : 70.22  
##  Mean   :259.8   Mean   :0.30175   Mean   :88.02   Mean   : 71.45  
##  3rd Qu.:265.2   3rd Qu.:0.47250   3rd Qu.:90.23   3rd Qu.: 95.86  
##  Max.   :305.0   Max.   :0.79800   Max.   :90.54   Max.   :113.18  
##                                                                    
##        dc             isi             bui             fwi       
##  Min.   :301.9   Min.   :3.149   Min.   : 50.8   Min.   :16.05  
##  1st Qu.:346.0   1st Qu.:4.528   1st Qu.: 68.7   1st Qu.:17.26  
##  Median :466.4   Median :6.093   Median :108.0   Median :20.31  
##  Mean   :513.7   Mean   :6.011   Mean   :104.6   Mean   :20.36  
##  3rd Qu.:634.1   3rd Qu.:7.575   3rd Qu.:143.9   3rd Qu.:23.41  
##  Max.   :820.0   Max.   :8.711   Max.   :151.4   Max.   :24.77  
##                                                                 
##      fuel                ros              sfc              tfc        
##  Length:4           Min.   : 0.000   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median : 0.218   Median :0.1750   Median :0.1750  
##                     Mean   : 2.609   Mean   :0.3669   Mean   :0.3669  
##                     3rd Qu.: 2.827   3rd Qu.:0.5419   3rd Qu.:0.5419  
##                     Max.   :10.000   Max.   :1.1174   Max.   :1.1174  
##                                                                       
##       bfc              hfi            cfb         age         estarea     
##  Min.   :0.0000   Min.   :   0   Min.   :0   Min.   :255   Min.   :7.516  
##  1st Qu.:0.0000   1st Qu.:   0   1st Qu.:0   1st Qu.:255   1st Qu.:7.719  
##  Median :0.1578   Median :  73   Median :0   Median :255   Median :7.923  
##  Mean   :0.8912   Mean   : 299   Mean   :0   Mean   :255   Mean   :7.923  
##  3rd Qu.:1.0490   3rd Qu.: 372   3rd Qu.:0   3rd Qu.:255   3rd Qu.:8.126  
##  Max.   :3.2490   Max.   :1050   Max.   :0   Max.   :255   Max.   :8.329  
##                                              NA's   :3     NA's   :2      
##      polyid       pcuring         cfactor     greenup       elev     
##  Min.   : NA   Min.   :104.0   Min.   :1   Min.   :1   Min.   :1127  
##  1st Qu.: NA   1st Qu.:113.8   1st Qu.:1   1st Qu.:1   1st Qu.:1654  
##  Median : NA   Median :118.0   Median :1   Median :1   Median :1898  
##  Mean   :NaN   Mean   :116.2   Mean   :1   Mean   :1   Mean   :1913  
##  3rd Qu.: NA   3rd Qu.:120.5   3rd Qu.:1   3rd Qu.:1   3rd Qu.:2157  
##  Max.   : NA   Max.   :125.0   Max.   :1   Max.   :1   Max.   :2727  
##  NA's   :4                     NA's   :1                             
##       cfl         tfc0             sfl        ecozone        sfc0     
##  Min.   :0   Min.   :0.0000   Min.   :-1   Min.   :12   Min.   :0.35  
##  1st Qu.:0   1st Qu.:0.1750   1st Qu.:-1   1st Qu.:12   1st Qu.:0.35  
##  Median :0   Median :0.3500   Median :-1   Median :12   Median :0.35  
##  Mean   :0   Mean   :0.4891   Mean   :-1   Mean   :12   Mean   :0.35  
##  3rd Qu.:0   3rd Qu.:0.7337   3rd Qu.:-1   3rd Qu.:12   3rd Qu.:0.35  
##  Max.   :0   Max.   :1.1174   Max.   :-1   Max.   :12   Max.   :0.35  
##              NA's   :1        NA's   :3    NA's   :3    NA's   :3     
##       cbh          year          month   event_cluster  
##  Min.   :-1   Min.   :2018   Aug    :3   Min.   :    0  
##  1st Qu.:-1   1st Qu.:2018   Jul    :1   1st Qu.:    0  
##  Median :-1   Median :2018   Jan    :0   Median : 4659  
##  Mean   :-1   Mean   :2018   Feb    :0   Mean   : 4970  
##  3rd Qu.:-1   3rd Qu.:2018   Mar    :0   3rd Qu.: 9629  
##  Max.   :-1   Max.   :2019   Apr    :0   Max.   :10563  
##  NA's   :3                   (Other):0

4 entries with values above 100% - out of range values, possible mistake in reporting

Visualization of curing percentage

zero_pcuring <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    zero_count = sum(pcuring == 0, na.rm = TRUE),
    zero_percentage = (zero_count / total_count) * 100
  ) %>%
  arrange(desc(year))

zero_pcuring
## # A tibble: 10 × 4
##     year total_count zero_count zero_percentage
##    <dbl>       <int>      <int>           <dbl>
##  1  2023      687827          0          0     
##  2  2022       46587         15          0.0322
##  3  2021      232832       5266          2.26  
##  4  2020       11507         12          0.104 
##  5  2019       28534        342          1.20  
##  6  2018      365355      51623         14.1   
##  7  2017      270146       3008          1.11  
##  8  2016       11094        417          3.76  
##  9  2015       44579        783          1.76  
## 10  2014       36152        773          2.14
sum(zero_pcuring$zero_count)
## [1] 62239
# 62239 zero values

Distribution of curing percentage

# Create the histogram for pcuring
ggplot(hotspots_peak_pcuring, aes(x = pcuring)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Curing Percentage (pcuring) Values", 
       x = "Curing Percentage (pcuring)", 
       y = "Frequency") +
  scale_y_continuous(labels = scales::comma) + 
  theme_minimal()

# Create the boxplot for pcuring across years
ggplot(hotspots_peak_pcuring, aes(x = factor(year), y = pcuring)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Curing Percentage (pcuring) Across Years",
       x = "Year",
       y = "Curing Percentage (pcuring)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The median pcuring values for these years range between 30% and 50%.There are many entries with pcuring at 0%, possibly indicating periods with no drying or data recording issues.

“cfactor”(Curing Factor)

# The amount of curing, or drying, of vegetation.

# Count missing values in the cfactor
sum(is.na(hotspots_peak$cfactor))
## [1] 1007287
# Identify years with the most missing values
missing_cfactor <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    missing_count = sum(is.na(cfactor)),
    missing_percentage = (missing_count / total_count) * 100
  ) %>%
  arrange(desc(year))

missing_cfactor
## # A tibble: 10 × 4
##     year total_count missing_count missing_percentage
##    <dbl>       <int>         <int>              <dbl>
##  1  2023      687827        687827                100
##  2  2022       46587         46587                100
##  3  2021      232832        232832                100
##  4  2020       11507         11507                100
##  5  2019       28534         28534                100
##  6  2018      365355             0                  0
##  7  2017      270146             0                  0
##  8  2016       11094             0                  0
##  9  2015       44579             0                  0
## 10  2014       36152             0                  0

2019 - 2023 does not have this variable.

Analyse of Curing factor after years removal

# Exclude the years where cfactor is missing
hotspots_peak_cfactor <- hotspots_peak %>%
  filter((year < 2019)) %>%
  filter(!is.na(cfactor))

# Check the summary of the filtered data
summary(hotspots_peak_cfactor$cfactor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0100  0.0400  0.1306  0.1200  1.0000

Represents the curing factor as a proportion rather than a percentage.

Visualization of Curing Factor

# Create the histogram for cfactor
ggplot(hotspots_peak_cfactor, aes(x = cfactor)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Curing Factor (cfactor) Values", 
       x = "Curing Factor (cfactor)", 
       y = "Frequency") +
  scale_y_continuous(labels = scales::comma) + 
  theme_minimal()

# Create the boxplot for cfactor across years
ggplot(hotspots_peak_cfactor, aes(x = factor(year), y = cfactor)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Curing Factor (cfactor) Across Years",
       x = "Year",
       y = "Curing Factor (cfactor)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10))

The variable has stopped being recorded recently. It may be not suitable for meaningful analyses for the 10 year period

“greenup” (phenological state of deciduous trees)

# phenological state of deciduous trees (0=leafless, 1=green)
sum(is.na(hotspots_peak$greenup))
## [1] 687686
# Identify years with the most missing values
missing_greenup <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    missing_count = sum(is.na(greenup)),
    missing_percentage = (missing_count / total_count) * 100
  ) %>%
  arrange(desc(year))

missing_greenup
## # A tibble: 10 × 4
##     year total_count missing_count missing_percentage
##    <dbl>       <int>         <int>              <dbl>
##  1  2023      687827        687686               100.
##  2  2022       46587             0                 0 
##  3  2021      232832             0                 0 
##  4  2020       11507             0                 0 
##  5  2019       28534             0                 0 
##  6  2018      365355             0                 0 
##  7  2017      270146             0                 0 
##  8  2016       11094             0                 0 
##  9  2015       44579             0                 0 
## 10  2014       36152             0                 0

2023 does not have this variable.

# Count the number of observations in each greenup state
greenup_count <- hotspots_peak %>%
  group_by(greenup) %>%
  summarise(count = n())
greenup_count
## # A tibble: 4 × 2
##   greenup  count
##     <dbl>  <int>
## 1      -1      4
## 2       0 412757
## 3       1 634166
## 4      NA 687686

There are invalid values present in the dataset. This variable is only 0 or 1

Summary of greenup variable

# Filter out invalid values in the greenup variable
hotspots_peak_greenup <- hotspots_peak %>%
  filter(greenup %in% c(0, 1))

# Check the summary of the filtered data
summary(hotspots_peak_greenup$greenup)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.6057  1.0000  1.0000
greenup_by_year <- hotspots_peak_greenup %>%
  group_by(year, greenup) %>%
  summarise(count = n(), .groups = 'drop') %>%
  arrange(desc(year), greenup)

Visualization of Greenup variable

# Visualize the count of observations in each greenup state
ggplot(greenup_by_year, aes(x = factor(greenup), y = count, fill = factor(greenup))) +
  geom_bar(stat = "identity") +
  labs(title = "Count of Observations by Greenup State",
       x = "Greenup State",
       y = "Count",
       fill = "Greenup State") +
  scale_fill_manual(values = c("0" = "bisque3", "1" = "lightgreen")) +
  theme_minimal()+
  scale_y_continuous(labels = scales::comma) 

There are significantly more observations for greenup state 1 (trees with green leaves) compared to state 0 (leafless trees).The greenup state alone may not provide significant insights into fire behavior or risks.

“elev” (Elevation above sea level)

# BC's elevation ranges from sea level to approximately 4000 meters,
# with the highest peak being Mount Fairweather at 4663 meters.
# The elev can influencing local weather pattern, fuel moisture

# Check for missing values
sum(is.na(hotspots_peak$elev))
## [1] 36148

A low percentage of missing values

range(hotspots_peak$elev, na.rm = TRUE)
## [1]   -1 3129

-1 3129 While there is one instance of a below sea level elevation, the range is valid for BC

# Identify and analyze outliers
elev_outliers <- hotspots_peak %>%
  filter(elev < 0)
print(elev_outliers)
## # A tibble: 314 × 42
##      lat   lon rep_date                 uid source sensor satellite agency  temp
##    <dbl> <dbl> <dttm>                 <dbl> <chr>  <chr>  <chr>     <chr>  <dbl>
##  1  49.0 -120. 2015-07-05 11:00:00 11188377 NASA   MODIS  Aqua      BC      33.4
##  2  49.1 -125. 2015-10-20 21:06:00 11787219 NOAA   AVHRR  NOAA-19   BC      13.8
##  3  54.0 -129. 2015-07-01 20:44:00 11073822 USFS   MODIS  Terra     BC      21.2
##  4  54.0 -129. 2015-07-01 20:44:00 11073821 USFS   MODIS  Terra     BC      21.2
##  5  49.0 -124. 2015-05-17 21:25:00 10692271 NASA   MODIS  Aqua      BC      15.9
##  6  49   -120. 2015-07-03 14:29:00 11137964 NOAA   AVHRR  NOAA-15   BC      35.8
##  7  51.0 -127. 2015-08-03 04:15:00 11470837 NOAA   AVHRR  METOP-A   BC      17.6
##  8  49.0 -120. 2015-07-03 09:20:00 11159191 USFS   IBAND  S-NPP     BC      35.8
##  9  49.0 -120. 2015-07-03 09:20:00 11159192 USFS   IBAND  S-NPP     BC      35.8
## 10  53.6 -132. 2015-10-12 19:49:00 11759426 USFS   Lands… L8        BC      12.8
## # ℹ 304 more rows
## # ℹ 33 more variables: rh <dbl>, ws <dbl>, wd <dbl>, pcp <dbl>, ffmc <dbl>,
## #   dmc <dbl>, dc <dbl>, isi <dbl>, bui <dbl>, fwi <dbl>, fuel <chr>,
## #   ros <dbl>, sfc <dbl>, tfc <dbl>, bfc <dbl>, hfi <dbl>, cfb <dbl>,
## #   age <dbl>, estarea <dbl>, polyid <dbl>, pcuring <dbl>, cfactor <dbl>,
## #   greenup <dbl>, elev <dbl>, cfl <dbl>, tfc0 <dbl>, sfl <dbl>, ecozone <dbl>,
## #   sfc0 <dbl>, cbh <dbl>, year <dbl>, month <ord>, event_cluster <int>
# Histogram of elevation values
ggplot(hotspots_peak, aes(x = elev)) +
  geom_histogram(binwidth = 100, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Elevation Values",
       x = "Elevation (meters)",
       y = "Frequency") +
  theme_minimal()
## Warning: Removed 36148 rows containing non-finite outside the scale range
## (`stat_bin()`).

Overall the distribution of elevation levels is normal for British Columbia.

“ecozone” Ecozone in which hotspot is located

# Identify years with the most missing values
missing_ecozone <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    total_count = n(),
    missing_count = sum(is.na(ecozone)),
    missing_percentage = (missing_count / total_count) * 100
  ) %>%
  arrange(desc(year))

missing_ecozone
## # A tibble: 10 × 4
##     year total_count missing_count missing_percentage
##    <dbl>       <int>         <int>              <dbl>
##  1  2023      687827            24            0.00349
##  2  2022       46587             0            0      
##  3  2021      232832             0            0      
##  4  2020       11507             4            0.0348 
##  5  2019       28534             0            0      
##  6  2018      365355        365355          100      
##  7  2017      270146        270146          100      
##  8  2016       11094         11094          100      
##  9  2015       44579         44579          100      
## 10  2014       36152         36152          100

For the years 2019-2023 there are only 24+4 missing entries.

# Count the number of hotspots in each ecozone
ecozone_counts <- hotspots_peak %>% filter(year >= 2019) %>% 
  group_by(ecozone) %>%
  summarise(count = n())

# Print the ecozone counts
print(ecozone_counts)
## # A tibble: 6 × 2
##   ecozone  count
##     <dbl>  <int>
## 1       4 321952
## 2       9  54214
## 3      12  40810
## 4      13  19171
## 5      14 571112
## 6      NA     28
# Convert ecozone to a factor
ecozone_counts$ecozone <- factor(ecozone_counts$ecozone, levels = c(14, 4, 9, 12, 13, "NA"))
# Define custom colors and labels for the selected ecozones (Statistics Canada site)
# https://www.statcan.gc.ca/en/subjects/standard/environment/elc/2017-map
custom_colors <- c("14" = "#B5D79F", "4" = "#989898", "9" = "#36C48E", 
                   "12" = "#ACC32D", "13" = "#05734D", "NA" = "#000000")

custom_labels <- c("14" = "Montane Cordillera", "4" = "Taiga Plains", "9" = "Boreal Plains",
                   "12" = "Boreal Cordillera", "13" = "Pacific Maritime")

# Visualize the distribution of hotspots by ecozone with custom legend
ggplot(ecozone_counts, aes(x = reorder(ecozone, -count), y = count, fill = ecozone)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribution of Hotspots by Ecozone (2019 onwards)",
       x = "Ecozone",
       y = "Number of Hotspots",
       fill = "Ecozone") +
  theme_minimal() +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = custom_colors, labels = custom_labels) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        legend.position = "right", 
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 10))

The plot shows distribution of ecozones in the hotspot dataset.

The plot shows that the Montane Cordillera ecozone has the highest number of hotspots, indicating it is the most fire-prone ecozone in the dataset from 2019 onwards.

For the variables : cfl, sfl,tfc0, sfc0, cbh are not suitable for analysis of the decade as data present only in recent years